{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "61c92a7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.preprocessing import OrdinalEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "48221de5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "all_files = glob.glob(\"Storing_csv_files_2\" + \"/*.csv\")\n",
    "frame = []\n",
    "for filename in all_files:\n",
    "    df = pd.read_csv(filename)\n",
    "    frame.append(df)\n",
    "original_df = pd.concat(frame)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "175e5a68",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Summary</th>\n",
       "      <th>Issue id</th>\n",
       "      <th>Status</th>\n",
       "      <th>Assignee</th>\n",
       "      <th>Updated</th>\n",
       "      <th>Last Viewed</th>\n",
       "      <th>Original estimate</th>\n",
       "      <th>Time Spent</th>\n",
       "      <th>Custom field (Start date)</th>\n",
       "      <th>...</th>\n",
       "      <th>diff_time_spent</th>\n",
       "      <th>percent_random</th>\n",
       "      <th>percent_sign</th>\n",
       "      <th>percentage</th>\n",
       "      <th>Due date</th>\n",
       "      <th>Label</th>\n",
       "      <th>percent_random_time_spent</th>\n",
       "      <th>percent_sign_time_spent</th>\n",
       "      <th>percentage_time_spent</th>\n",
       "      <th>Random_time_spent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>Problem statement</td>\n",
       "      <td>10018</td>\n",
       "      <td>Done</td>\n",
       "      <td>Alexey</td>\n",
       "      <td>3/7/2022 13:12</td>\n",
       "      <td>3/7/2022 13:14</td>\n",
       "      <td>12</td>\n",
       "      <td>19.622701</td>\n",
       "      <td>2021-11-01</td>\n",
       "      <td>...</td>\n",
       "      <td>6</td>\n",
       "      <td>1.225434</td>\n",
       "      <td>2</td>\n",
       "      <td>2.450868</td>\n",
       "      <td>2022-03-02</td>\n",
       "      <td>0</td>\n",
       "      <td>1.135225</td>\n",
       "      <td>2</td>\n",
       "      <td>2.270450</td>\n",
       "      <td>19.622701</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>Project Proposal</td>\n",
       "      <td>10019</td>\n",
       "      <td>Done</td>\n",
       "      <td>Angie</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>24</td>\n",
       "      <td>16.182582</td>\n",
       "      <td>2021-11-28</td>\n",
       "      <td>...</td>\n",
       "      <td>4</td>\n",
       "      <td>0.047835</td>\n",
       "      <td>-2</td>\n",
       "      <td>-0.095670</td>\n",
       "      <td>2021-12-31</td>\n",
       "      <td>0</td>\n",
       "      <td>1.522823</td>\n",
       "      <td>2</td>\n",
       "      <td>3.045645</td>\n",
       "      <td>16.182582</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>Assign project tasks to the team</td>\n",
       "      <td>10020</td>\n",
       "      <td>Done</td>\n",
       "      <td>Manoj</td>\n",
       "      <td>3/7/2022 13:13</td>\n",
       "      <td>3/7/2022 13:14</td>\n",
       "      <td>2</td>\n",
       "      <td>3.853344</td>\n",
       "      <td>2021-11-30</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1.467797</td>\n",
       "      <td>2</td>\n",
       "      <td>2.935593</td>\n",
       "      <td>2022-04-23</td>\n",
       "      <td>0</td>\n",
       "      <td>1.426672</td>\n",
       "      <td>2</td>\n",
       "      <td>2.853344</td>\n",
       "      <td>3.853344</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>Discuss budgeting</td>\n",
       "      <td>10021</td>\n",
       "      <td>Done</td>\n",
       "      <td>Violet</td>\n",
       "      <td>3/7/2022 13:04</td>\n",
       "      <td>3/7/2022 13:14</td>\n",
       "      <td>4</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>2021-12-01</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1.870390</td>\n",
       "      <td>2</td>\n",
       "      <td>3.740780</td>\n",
       "      <td>2022-05-17</td>\n",
       "      <td>0</td>\n",
       "      <td>0.991062</td>\n",
       "      <td>2</td>\n",
       "      <td>1.982123</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>Software Requirements Document</td>\n",
       "      <td>10022</td>\n",
       "      <td>Done</td>\n",
       "      <td>Latifah</td>\n",
       "      <td>3/7/2022 13:04</td>\n",
       "      <td>3/7/2022 13:14</td>\n",
       "      <td>48</td>\n",
       "      <td>89.600950</td>\n",
       "      <td>2021-12-02</td>\n",
       "      <td>...</td>\n",
       "      <td>24</td>\n",
       "      <td>1.473269</td>\n",
       "      <td>2</td>\n",
       "      <td>2.946539</td>\n",
       "      <td>2022-04-25</td>\n",
       "      <td>0</td>\n",
       "      <td>1.366686</td>\n",
       "      <td>2</td>\n",
       "      <td>2.733373</td>\n",
       "      <td>89.600950</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>15</td>\n",
       "      <td>View Foreign Exchange account</td>\n",
       "      <td>10033</td>\n",
       "      <td>Done</td>\n",
       "      <td>Latifah</td>\n",
       "      <td>3/7/2022 13:07</td>\n",
       "      <td>3/7/2022 13:14</td>\n",
       "      <td>36</td>\n",
       "      <td>-44.080124</td>\n",
       "      <td>2022-03-13</td>\n",
       "      <td>...</td>\n",
       "      <td>-12</td>\n",
       "      <td>1.637118</td>\n",
       "      <td>2</td>\n",
       "      <td>3.274236</td>\n",
       "      <td>2022-06-16</td>\n",
       "      <td>0</td>\n",
       "      <td>1.336672</td>\n",
       "      <td>2</td>\n",
       "      <td>2.673344</td>\n",
       "      <td>-44.080124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>16</td>\n",
       "      <td>Exchange currency tab</td>\n",
       "      <td>10034</td>\n",
       "      <td>Done</td>\n",
       "      <td>Latifah</td>\n",
       "      <td>3/7/2022 12:58</td>\n",
       "      <td>3/7/2022 13:14</td>\n",
       "      <td>48</td>\n",
       "      <td>23.871948</td>\n",
       "      <td>2022-04-04</td>\n",
       "      <td>...</td>\n",
       "      <td>10</td>\n",
       "      <td>0.005432</td>\n",
       "      <td>-2</td>\n",
       "      <td>-0.010863</td>\n",
       "      <td>2022-04-20</td>\n",
       "      <td>0</td>\n",
       "      <td>0.693597</td>\n",
       "      <td>2</td>\n",
       "      <td>1.387195</td>\n",
       "      <td>23.871948</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>17</td>\n",
       "      <td>User testing</td>\n",
       "      <td>10035</td>\n",
       "      <td>Done</td>\n",
       "      <td>Sunny</td>\n",
       "      <td>3/7/2022 12:56</td>\n",
       "      <td>3/7/2022 13:14</td>\n",
       "      <td>8</td>\n",
       "      <td>-4.324125</td>\n",
       "      <td>2022-04-15</td>\n",
       "      <td>...</td>\n",
       "      <td>-1</td>\n",
       "      <td>1.494751</td>\n",
       "      <td>2</td>\n",
       "      <td>2.989502</td>\n",
       "      <td>2022-06-29</td>\n",
       "      <td>0</td>\n",
       "      <td>1.662063</td>\n",
       "      <td>2</td>\n",
       "      <td>3.324125</td>\n",
       "      <td>-4.324125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>18</td>\n",
       "      <td>Vendors testing the software for any bugs</td>\n",
       "      <td>10036</td>\n",
       "      <td>Done</td>\n",
       "      <td>Terry/Rosemary/Lyte</td>\n",
       "      <td>3/7/2022 12:56</td>\n",
       "      <td>3/7/2022 13:14</td>\n",
       "      <td>24</td>\n",
       "      <td>15.027167</td>\n",
       "      <td>2022-04-26</td>\n",
       "      <td>...</td>\n",
       "      <td>10</td>\n",
       "      <td>0.633597</td>\n",
       "      <td>2</td>\n",
       "      <td>1.267194</td>\n",
       "      <td>2022-06-24</td>\n",
       "      <td>0</td>\n",
       "      <td>0.251358</td>\n",
       "      <td>2</td>\n",
       "      <td>0.502717</td>\n",
       "      <td>15.027167</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>19</td>\n",
       "      <td>Releasing 2.0</td>\n",
       "      <td>10037</td>\n",
       "      <td>Done</td>\n",
       "      <td>Angie</td>\n",
       "      <td>3/7/2022 13:06</td>\n",
       "      <td>3/7/2022 13:14</td>\n",
       "      <td>9</td>\n",
       "      <td>12.184136</td>\n",
       "      <td>2022-05-24</td>\n",
       "      <td>...</td>\n",
       "      <td>5</td>\n",
       "      <td>0.333757</td>\n",
       "      <td>2</td>\n",
       "      <td>0.667515</td>\n",
       "      <td>2022-07-22</td>\n",
       "      <td>0</td>\n",
       "      <td>0.718414</td>\n",
       "      <td>2</td>\n",
       "      <td>1.436827</td>\n",
       "      <td>12.184136</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>20000 rows × 23 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    Unnamed: 0                                    Summary  Issue id Status  \\\n",
       "0            0                          Problem statement     10018   Done   \n",
       "1            1                           Project Proposal     10019   Done   \n",
       "2            2           Assign project tasks to the team     10020   Done   \n",
       "3            3                          Discuss budgeting     10021   Done   \n",
       "4            4             Software Requirements Document     10022   Done   \n",
       "..         ...                                        ...       ...    ...   \n",
       "15          15              View Foreign Exchange account     10033   Done   \n",
       "16          16                      Exchange currency tab     10034   Done   \n",
       "17          17                               User testing     10035   Done   \n",
       "18          18  Vendors testing the software for any bugs     10036   Done   \n",
       "19          19                              Releasing 2.0     10037   Done   \n",
       "\n",
       "               Assignee         Updated     Last Viewed  Original estimate  \\\n",
       "0                Alexey  3/7/2022 13:12  3/7/2022 13:14                 12   \n",
       "1                 Angie             NaN             NaN                 24   \n",
       "2                 Manoj  3/7/2022 13:13  3/7/2022 13:14                  2   \n",
       "3                Violet  3/7/2022 13:04  3/7/2022 13:14                  4   \n",
       "4               Latifah  3/7/2022 13:04  3/7/2022 13:14                 48   \n",
       "..                  ...             ...             ...                ...   \n",
       "15              Latifah  3/7/2022 13:07  3/7/2022 13:14                 36   \n",
       "16              Latifah  3/7/2022 12:58  3/7/2022 13:14                 48   \n",
       "17                Sunny  3/7/2022 12:56  3/7/2022 13:14                  8   \n",
       "18  Terry/Rosemary/Lyte  3/7/2022 12:56  3/7/2022 13:14                 24   \n",
       "19                Angie  3/7/2022 13:06  3/7/2022 13:14                  9   \n",
       "\n",
       "    Time Spent Custom field (Start date)  ...  diff_time_spent percent_random  \\\n",
       "0    19.622701                2021-11-01  ...                6       1.225434   \n",
       "1    16.182582                2021-11-28  ...                4       0.047835   \n",
       "2     3.853344                2021-11-30  ...                1       1.467797   \n",
       "3     0.000000                2021-12-01  ...                0       1.870390   \n",
       "4    89.600950                2021-12-02  ...               24       1.473269   \n",
       "..         ...                       ...  ...              ...            ...   \n",
       "15  -44.080124                2022-03-13  ...              -12       1.637118   \n",
       "16   23.871948                2022-04-04  ...               10       0.005432   \n",
       "17   -4.324125                2022-04-15  ...               -1       1.494751   \n",
       "18   15.027167                2022-04-26  ...               10       0.633597   \n",
       "19   12.184136                2022-05-24  ...                5       0.333757   \n",
       "\n",
       "    percent_sign  percentage    Due date  Label  percent_random_time_spent  \\\n",
       "0              2    2.450868  2022-03-02      0                   1.135225   \n",
       "1             -2   -0.095670  2021-12-31      0                   1.522823   \n",
       "2              2    2.935593  2022-04-23      0                   1.426672   \n",
       "3              2    3.740780  2022-05-17      0                   0.991062   \n",
       "4              2    2.946539  2022-04-25      0                   1.366686   \n",
       "..           ...         ...         ...    ...                        ...   \n",
       "15             2    3.274236  2022-06-16      0                   1.336672   \n",
       "16            -2   -0.010863  2022-04-20      0                   0.693597   \n",
       "17             2    2.989502  2022-06-29      0                   1.662063   \n",
       "18             2    1.267194  2022-06-24      0                   0.251358   \n",
       "19             2    0.667515  2022-07-22      0                   0.718414   \n",
       "\n",
       "   percent_sign_time_spent  percentage_time_spent  Random_time_spent  \n",
       "0                        2               2.270450          19.622701  \n",
       "1                        2               3.045645          16.182582  \n",
       "2                        2               2.853344           3.853344  \n",
       "3                        2               1.982123           0.000000  \n",
       "4                        2               2.733373          89.600950  \n",
       "..                     ...                    ...                ...  \n",
       "15                       2               2.673344         -44.080124  \n",
       "16                       2               1.387195          23.871948  \n",
       "17                       2               3.324125          -4.324125  \n",
       "18                       2               0.502717          15.027167  \n",
       "19                       2               1.436827          12.184136  \n",
       "\n",
       "[20000 rows x 23 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "original_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ed0098f0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   Status             Assignee\n",
      "0    Done               Alexey\n",
      "1    Done                Angie\n",
      "2    Done                Manoj\n",
      "3    Done               Violet\n",
      "4    Done              Latifah\n",
      "..    ...                  ...\n",
      "15   Done              Latifah\n",
      "16   Done              Latifah\n",
      "17   Done                Sunny\n",
      "18   Done  Terry/Rosemary/Lyte\n",
      "19   Done                Angie\n",
      "\n",
      "[20000 rows x 2 columns]\n",
      "[[ 0.  1.]\n",
      " [ 0.  2.]\n",
      " [ 0.  8.]\n",
      " ...\n",
      " [ 0. 11.]\n",
      " [ 0. 12.]\n",
      " [ 0.  2.]]\n"
     ]
    }
   ],
   "source": [
    "original_df['Status'].fillna('Done', inplace = True)\n",
    "transform_df = original_df[['Status','Assignee']]\n",
    "print(transform_df)\n",
    "\n",
    "encoder = OrdinalEncoder()\n",
    "encoder.fit(transform_df)\n",
    "final_df = encoder.transform(transform_df)\n",
    "print(final_df)\n",
    "#final_df = pd.DataFrame(final_df, columns = ['Status', 'Label(Binary) Project with all tasks', 'Assignee' ])\n",
    "#final_df = pd.concat([final_df, original_df], axis = 1, ignore_indexing = True)\n",
    "#final_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c694a3c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "arr = original_df[['diff_dates','percent_random', 'Original estimate', 'diff_time_spent', 'Time Spent']].to_numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "42d3331d",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = original_df[['diff_dates','percent_random', 'Original estimate', 'diff_time_spent', 'Time Spent', 'Label']]\n",
    "y = original_df['Label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "8da86de6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Meep\\AppData\\Local\\Temp/ipykernel_6912/1957488410.py:5: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  X['Label'] = le.fit_transform(X['Label'])\n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "le = LabelEncoder()\n",
    "\n",
    "X['Label'] = le.fit_transform(X['Label'])\n",
    "\n",
    "y = le.transform(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a7e782ad",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 20000 entries, 0 to 19\n",
      "Data columns (total 23 columns):\n",
      " #   Column                                Non-Null Count  Dtype  \n",
      "---  ------                                --------------  -----  \n",
      " 0   Unnamed: 0                            20000 non-null  int64  \n",
      " 1   Summary                               20000 non-null  object \n",
      " 2   Issue id                              20000 non-null  int64  \n",
      " 3   Status                                20000 non-null  object \n",
      " 4   Assignee                              20000 non-null  object \n",
      " 5   Updated                               19000 non-null  object \n",
      " 6   Last Viewed                           19000 non-null  object \n",
      " 7   Original estimate                     20000 non-null  int64  \n",
      " 8   Time Spent                            20000 non-null  float64\n",
      " 9   Custom field (Start date)             20000 non-null  object \n",
      " 10  Dependency                            19000 non-null  float64\n",
      " 11  Label(Binary) Project with all tasks  20000 non-null  object \n",
      " 12  diff_dates                            20000 non-null  float64\n",
      " 13  diff_time_spent                       20000 non-null  int64  \n",
      " 14  percent_random                        20000 non-null  float64\n",
      " 15  percent_sign                          20000 non-null  int64  \n",
      " 16  percentage                            20000 non-null  float64\n",
      " 17  Due date                              20000 non-null  object \n",
      " 18  Label                                 20000 non-null  int64  \n",
      " 19  percent_random_time_spent             20000 non-null  float64\n",
      " 20  percent_sign_time_spent               20000 non-null  int64  \n",
      " 21  percentage_time_spent                 20000 non-null  float64\n",
      " 22  Random_time_spent                     20000 non-null  float64\n",
      "dtypes: float64(8), int64(7), object(8)\n",
      "memory usage: 3.7+ MB\n"
     ]
    }
   ],
   "source": [
    "X.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "3d23c239",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Summary</th>\n",
       "      <th>Issue id</th>\n",
       "      <th>Status</th>\n",
       "      <th>Assignee</th>\n",
       "      <th>Updated</th>\n",
       "      <th>Last Viewed</th>\n",
       "      <th>Original estimate</th>\n",
       "      <th>Time Spent</th>\n",
       "      <th>Custom field (Start date)</th>\n",
       "      <th>...</th>\n",
       "      <th>diff_time_spent</th>\n",
       "      <th>percent_random</th>\n",
       "      <th>percent_sign</th>\n",
       "      <th>percentage</th>\n",
       "      <th>Due date</th>\n",
       "      <th>Label</th>\n",
       "      <th>percent_random_time_spent</th>\n",
       "      <th>percent_sign_time_spent</th>\n",
       "      <th>percentage_time_spent</th>\n",
       "      <th>Random_time_spent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>Problem statement</td>\n",
       "      <td>10018</td>\n",
       "      <td>Done</td>\n",
       "      <td>Alexey</td>\n",
       "      <td>3/7/2022 13:12</td>\n",
       "      <td>3/7/2022 13:14</td>\n",
       "      <td>12</td>\n",
       "      <td>19.622701</td>\n",
       "      <td>2021-11-01</td>\n",
       "      <td>...</td>\n",
       "      <td>6</td>\n",
       "      <td>1.225434</td>\n",
       "      <td>2</td>\n",
       "      <td>2.450868</td>\n",
       "      <td>2022-03-02</td>\n",
       "      <td>0</td>\n",
       "      <td>1.135225</td>\n",
       "      <td>2</td>\n",
       "      <td>2.270450</td>\n",
       "      <td>19.622701</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>Project Proposal</td>\n",
       "      <td>10019</td>\n",
       "      <td>Done</td>\n",
       "      <td>Angie</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>24</td>\n",
       "      <td>16.182582</td>\n",
       "      <td>2021-11-28</td>\n",
       "      <td>...</td>\n",
       "      <td>4</td>\n",
       "      <td>0.047835</td>\n",
       "      <td>-2</td>\n",
       "      <td>-0.095670</td>\n",
       "      <td>2021-12-31</td>\n",
       "      <td>0</td>\n",
       "      <td>1.522823</td>\n",
       "      <td>2</td>\n",
       "      <td>3.045645</td>\n",
       "      <td>16.182582</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>Assign project tasks to the team</td>\n",
       "      <td>10020</td>\n",
       "      <td>Done</td>\n",
       "      <td>Manoj</td>\n",
       "      <td>3/7/2022 13:13</td>\n",
       "      <td>3/7/2022 13:14</td>\n",
       "      <td>2</td>\n",
       "      <td>3.853344</td>\n",
       "      <td>2021-11-30</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1.467797</td>\n",
       "      <td>2</td>\n",
       "      <td>2.935593</td>\n",
       "      <td>2022-04-23</td>\n",
       "      <td>0</td>\n",
       "      <td>1.426672</td>\n",
       "      <td>2</td>\n",
       "      <td>2.853344</td>\n",
       "      <td>3.853344</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>Discuss budgeting</td>\n",
       "      <td>10021</td>\n",
       "      <td>Done</td>\n",
       "      <td>Violet</td>\n",
       "      <td>3/7/2022 13:04</td>\n",
       "      <td>3/7/2022 13:14</td>\n",
       "      <td>4</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>2021-12-01</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1.870390</td>\n",
       "      <td>2</td>\n",
       "      <td>3.740780</td>\n",
       "      <td>2022-05-17</td>\n",
       "      <td>0</td>\n",
       "      <td>0.991062</td>\n",
       "      <td>2</td>\n",
       "      <td>1.982123</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>Software Requirements Document</td>\n",
       "      <td>10022</td>\n",
       "      <td>Done</td>\n",
       "      <td>Latifah</td>\n",
       "      <td>3/7/2022 13:04</td>\n",
       "      <td>3/7/2022 13:14</td>\n",
       "      <td>48</td>\n",
       "      <td>89.600950</td>\n",
       "      <td>2021-12-02</td>\n",
       "      <td>...</td>\n",
       "      <td>24</td>\n",
       "      <td>1.473269</td>\n",
       "      <td>2</td>\n",
       "      <td>2.946539</td>\n",
       "      <td>2022-04-25</td>\n",
       "      <td>0</td>\n",
       "      <td>1.366686</td>\n",
       "      <td>2</td>\n",
       "      <td>2.733373</td>\n",
       "      <td>89.600950</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 23 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0                           Summary  Issue id Status Assignee  \\\n",
       "0           0                 Problem statement     10018   Done   Alexey   \n",
       "1           1                  Project Proposal     10019   Done    Angie   \n",
       "2           2  Assign project tasks to the team     10020   Done    Manoj   \n",
       "3           3                 Discuss budgeting     10021   Done   Violet   \n",
       "4           4    Software Requirements Document     10022   Done  Latifah   \n",
       "\n",
       "          Updated     Last Viewed  Original estimate  Time Spent  \\\n",
       "0  3/7/2022 13:12  3/7/2022 13:14                 12   19.622701   \n",
       "1             NaN             NaN                 24   16.182582   \n",
       "2  3/7/2022 13:13  3/7/2022 13:14                  2    3.853344   \n",
       "3  3/7/2022 13:04  3/7/2022 13:14                  4    0.000000   \n",
       "4  3/7/2022 13:04  3/7/2022 13:14                 48   89.600950   \n",
       "\n",
       "  Custom field (Start date)  ...  diff_time_spent percent_random  \\\n",
       "0                2021-11-01  ...                6       1.225434   \n",
       "1                2021-11-28  ...                4       0.047835   \n",
       "2                2021-11-30  ...                1       1.467797   \n",
       "3                2021-12-01  ...                0       1.870390   \n",
       "4                2021-12-02  ...               24       1.473269   \n",
       "\n",
       "   percent_sign  percentage    Due date  Label  percent_random_time_spent  \\\n",
       "0             2    2.450868  2022-03-02      0                   1.135225   \n",
       "1            -2   -0.095670  2021-12-31      0                   1.522823   \n",
       "2             2    2.935593  2022-04-23      0                   1.426672   \n",
       "3             2    3.740780  2022-05-17      0                   0.991062   \n",
       "4             2    2.946539  2022-04-25      0                   1.366686   \n",
       "\n",
       "  percent_sign_time_spent  percentage_time_spent  Random_time_spent  \n",
       "0                       2               2.270450          19.622701  \n",
       "1                       2               3.045645          16.182582  \n",
       "2                       2               2.853344           3.853344  \n",
       "3                       2               1.982123           0.000000  \n",
       "4                       2               2.733373          89.600950  \n",
       "\n",
       "[5 rows x 23 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "ba24bb87",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "KMeans(n_clusters=2, random_state=100)"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "\n",
    "kmeans = KMeans(n_clusters=2, random_state=100) \n",
    "\n",
    "kmeans.fit(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "80b6652a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1.92096063e+01, 9.96420285e-01, 2.01382854e+01, 4.79033789e+00,\n",
       "        1.21294420e+01, 1.12203295e-01],\n",
       "       [1.76859189e+01, 1.00767017e+00, 7.84840095e+01, 4.73646778e+01,\n",
       "        1.58562271e+02, 9.54653938e-03]])"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kmeans.cluster_centers_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "eed91279",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Result: 15916 out of 20000 samples were correctly labeled.\n"
     ]
    }
   ],
   "source": [
    "labels = kmeans.labels_\n",
    "\n",
    "# check how many of the samples were correctly labeled\n",
    "correct_labels = sum(y == labels)\n",
    "\n",
    "print(\"Result: %d out of %d samples were correctly labeled.\" % (correct_labels, y.size))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "730019fd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy score: 0.80\n"
     ]
    }
   ],
   "source": [
    "print('Accuracy score: {0:0.2f}'. format(correct_labels/float(y.size)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7aab85bd",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
